import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as colors
import plotly.io as pio
#pio.renderers.default = 'png' # Forces static images for Plotly outputs
df=pd.read_csv("sales_forecasting.csv", encoding="latin1")
df.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2016-138688 | 6/12/2016 | 6/16/2016 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9994 non-null int64 1 Order ID 9994 non-null object 2 Order Date 9994 non-null object 3 Ship Date 9994 non-null object 4 Ship Mode 9994 non-null object 5 Customer ID 9994 non-null object 6 Customer Name 9994 non-null object 7 Segment 9994 non-null object 8 Country 9994 non-null object 9 City 9994 non-null object 10 State 9994 non-null object 11 Postal Code 9994 non-null int64 12 Region 9994 non-null object 13 Product ID 9994 non-null object 14 Category 9994 non-null object 15 Sub-Category 9994 non-null object 16 Product Name 9994 non-null object 17 Sales 9994 non-null float64 18 Quantity 9994 non-null int64 19 Discount 9994 non-null float64 20 Profit 9994 non-null float64 dtypes: float64(3), int64(3), object(15) memory usage: 1.6+ MB
df.describe()
| Row ID | Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|
| count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 4997.500000 | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| std | 2885.163629 | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
| min | 1.000000 | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 2499.250000 | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 4997.500000 | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 7495.750000 | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 9994.000000 | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
df['Ship Date']= pd.to_datetime(df['Ship Date'])
df['Order Date']= pd.to_datetime(df['Order Date'])
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9994 non-null int64 1 Order ID 9994 non-null object 2 Order Date 9994 non-null datetime64[ns] 3 Ship Date 9994 non-null datetime64[ns] 4 Ship Mode 9994 non-null object 5 Customer ID 9994 non-null object 6 Customer Name 9994 non-null object 7 Segment 9994 non-null object 8 Country 9994 non-null object 9 City 9994 non-null object 10 State 9994 non-null object 11 Postal Code 9994 non-null int64 12 Region 9994 non-null object 13 Product ID 9994 non-null object 14 Category 9994 non-null object 15 Sub-Category 9994 non-null object 16 Product Name 9994 non-null object 17 Sales 9994 non-null float64 18 Quantity 9994 non-null int64 19 Discount 9994 non-null float64 20 Profit 9994 non-null float64 dtypes: datetime64[ns](2), float64(3), int64(3), object(13) memory usage: 1.6+ MB
df['Month of order']=df['Order Date'].dt.month
df['Year of order']=df['Order Date'].dt.year
df['Day of order']=df['Order Date'].dt.day_name()
df.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | Month of order | Year of order | Day of order | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2016-152156 | 2016-11-08 | 2016-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 | 11 | 2016 | Tuesday |
| 1 | 2 | CA-2016-152156 | 2016-11-08 | 2016-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 | 11 | 2016 | Tuesday |
| 2 | 3 | CA-2016-138688 | 2016-06-12 | 2016-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 | 6 | 2016 | Sunday |
| 3 | 4 | US-2015-108966 | 2015-10-11 | 2015-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 | 10 | 2015 | Sunday |
| 4 | 5 | US-2015-108966 | 2015-10-11 | 2015-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 | 10 | 2015 | Sunday |
5 rows × 24 columns
monthly_sales=df.groupby('Month of order')['Sales'].sum().reset_index()
fig=px.line(monthly_sales,x="Month of order", y="Sales",title="Monthly Sales Analysis")
fig.show()
In the month of November, the sales are maximun and in month of February the sales are minimum.
category_sales=df.groupby('Category')['Sales'].sum().reset_index()
fig=px.pie(category_sales,values="Sales", names='Category',title="Sales Analysis by Category", hole=0.1)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()
Maximum sales is observed in the technology category.
subcategory_sales=df.groupby('Sub-Category')['Sales'].sum().reset_index()
fig=px.bar(subcategory_sales, x='Sub-Category',y="Sales", title="Sales Analysis by Sub Category")
#fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()
Maximum sales are observed in the chairs and phones while fasteners have minimum sales.
monthly_sales=df.groupby('Month of order')['Profit'].sum().reset_index()
fig=px.line(monthly_sales,x="Month of order", y="Profit",title="Monthly Profit Analysis")
fig.show()
Maximum profit is in december while minimum profit is in january.
top_products = df.groupby('Product Name')['Sales'].sum().reset_index().sort_values(by='Sales', ascending=False).head(10)
fig = px.bar(top_products, x='Product Name', y='Sales', title='Top 10 Products by Revenue')
fig.show()
monthly_units = df.groupby('Month of order')['Quantity'].sum().reset_index()
fig = px.line(monthly_units, x='Month of order', y='Quantity', title='Monthly Trend of Units Sold')
fig.show()
In month of November maximum quantity is sold.
yearly_sales_growth = df.groupby('Year of order')['Sales'].sum().reset_index()
fig = px.bar(yearly_sales_growth, x="Year of order", y="Sales", title='Year-over-Year Sales Growth')
fig.update_layout(xaxis_type='category')
fig.show()
In the year of 2017 maximum sales are observed.
region_contrib = df.groupby('Region')['Sales'].sum().reset_index()
region_contrib['Sales_Percentage'] = round(region_contrib['Sales'] / (region_contrib['Sales'].sum()) * 100, 2)
fig = px.pie(region_contrib, names='Region', values='Sales_Percentage', title='Region-wise Sales Contribution (%)')
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()
In the west region the total sales percentage is maximum which is about 31.6%
p_c=df.groupby('Category')['Profit'].sum().reset_index()
fig=px.bar(p_c, x="Category",y="Profit", title="Profit Analysis by Category")
fig.show()
In the technology category the profit is maximum and in the Furniture category the profit is minimum.
p_sc=df.groupby('Sub-Category')['Profit'].sum().reset_index()
fig=px.bar(p_sc, x="Sub-Category",y="Profit", title="Profit Analysis by Sub Category")
fig.show()
In the Copiers category the profit is maximum and in the Tables category the profit is minimum which is negative.
sp_segment= df.groupby('Segment').agg({'Sales':sum, 'Profit':sum}).reset_index()
cp=colors.qualitative.Pastel
fig=go.Figure()
fig.add_trace(go.Bar(x=sp_segment['Segment'],
y=sp_segment['Sales'],
name='Sales',
marker_color=cp[2]))
fig.add_trace(go.Bar(x=sp_segment['Segment'],
y=sp_segment['Profit'],
name='Profit',
marker_color=cp[4]))
fig.update_layout(title='sales and profit analysis by segment',
xaxis_title='Customer Segment', yaxis_title='Amount')
fig.show()
The consumers provided the maximum profit in sales about 1.1M and Home officers provided the minimum profit about 0.4M.
sp_segment = df.groupby('Segment').agg({'Sales': 'sum', 'Profit': 'sum'}).reset_index()
sp_segment['Sales_to_Profit_Ratio'] = sp_segment['Sales'] / sp_segment['Profit']
print(sp_segment[['Segment', 'Sales_to_Profit_Ratio']])
Segment Sales_to_Profit_Ratio 0 Consumer 8.659471 1 Corporate 7.677245 2 Home Office 7.125416
The consumer Sales to profit ratio is maximum.
df_sorted = df.sort_values('Order Date')
df_sorted['Sales_MA_3'] = df_sorted['Sales'].rolling(window=3).mean()
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_sorted['Order Date'], y=df_sorted['Sales'], mode='lines', name='Actual Sales'))
fig.add_trace(go.Scatter(x=df_sorted['Order Date'], y=df_sorted['Sales_MA_3'], mode='lines', name='3-Month Moving Avg', line=dict(dash='dash')))
fig.update_layout(title='Sales with 3-Month Moving Average')
fig.show()
The 3 month monthly average is mostly less than the actual sales observed.
threshold = df['Sales'].quantile(0.95)
outliers = df[df['Sales'] > threshold]
fig = px.scatter(df, x='Order Date', y='Sales', title='Sales Outliers (Above 95th Percentile)', color=df['Sales'] > threshold)
fig.show()
According to the graph the Bulk order is observed on the date 18 March, 2014.